Here we do exploratory data analysis on HMDA (Home Mortgage Disclosure Act) data obtained for Pennsylvania for the year 2015. We start by looking at the data superficially and then dive into the columns of interest. Then we check for missing values and handle them. Let's get started with the steps.
# Install any packages this notebook needs that are not yet installed.
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
# ggplot2, rstudioapi and janitor are attached with library() further down,
# so they are part of the required set as well.
list_of_packages <- c(
  "mlbench", "corrplot", "rvest", "tidyr", "stringr", "dplyr", "lubridate",
  "data.table", "mice", "scales", "naniar", "rpart", "rpart.plot", "caret",
  "moments", "ggplot2", "rstudioapi", "janitor"
)
new.packages <- list_of_packages[!(list_of_packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  message("Installing packages: ", paste(new.packages, collapse = ", "))
  # Pass the character vector itself. Writing `new.packages()` would call
  # utils::new.packages() (R skips non-function bindings when resolving a
  # call) and try to install every package missing from the library.
  install.packages(new.packages)
}
library(corrplot)
library(ggplot2)
library(tidyr)
library(stringr)
library(dplyr)
library(data.table)
library(mice)
library(rstudioapi)
library(moments)
library(naniar)
# Load the project's helper scripts. Their location is derived by applying
# dirname() three times to the path of the active RStudio document and then
# appending the relative script path.
# NOTE(review): these scripts presumably define the graph_* / corr_simple /
# model helper functions used later in this notebook - confirm.
invisible(lapply(
  c("utils/utils.r", "utils/model_utils.r"),
  function(helper_script) {
    base_dir <- dirname(dirname(dirname(rstudioapi::getActiveDocumentContext()$path)))
    # source() evaluates in the global environment by default (local = FALSE).
    source(paste(base_dir, helper_script, sep = "/"))
  }
))
# Folder containing the raw HMDA extract (absolute, machine-specific path;
# note the trailing slash).
data_dir <- "/Users/omkarpawar/Desktop/Data/PA/"
# Read the 2015 Pennsylvania all-records file with data.table::fread.
hmda_data_pa <- fread(paste0(data_dir, "hmda_2015_pa_all-records_labels.csv"))
|--------------------------------------------------|
|==================================================|
Let's look at the first few rows of our data and see what they tell us about the applications.
# Continue with a plain data.frame copy of the fread() result.
hmda_data_pa_df <- as.data.frame(hmda_data_pa)
# Restrict the analysis to conventional loans (loan_type code 1).
hmda_data_pa_df <- hmda_data_pa_df[hmda_data_pa_df[["loan_type"]] == "1", ]
# List the available columns.
names(hmda_data_pa_df)
[1] "as_of_year" "respondent_id"
[3] "agency_name" "agency_abbr"
[5] "agency_code" "loan_type_name"
[7] "loan_type" "property_type_name"
[9] "property_type" "loan_purpose_name"
[11] "loan_purpose" "owner_occupancy_name"
[13] "owner_occupancy" "loan_amount_000s"
[15] "preapproval_name" "preapproval"
[17] "action_taken_name" "action_taken"
[19] "msamd_name" "msamd"
[21] "state_name" "state_abbr"
[23] "state_code" "county_name"
[25] "county_code" "census_tract_number"
[27] "applicant_ethnicity_name" "applicant_ethnicity"
[29] "co_applicant_ethnicity_name" "co_applicant_ethnicity"
[31] "applicant_race_name_1" "applicant_race_1"
[33] "applicant_race_name_2" "applicant_race_2"
[35] "applicant_race_name_3" "applicant_race_3"
[37] "applicant_race_name_4" "applicant_race_4"
[39] "applicant_race_name_5" "applicant_race_5"
[41] "co_applicant_race_name_1" "co_applicant_race_1"
[43] "co_applicant_race_name_2" "co_applicant_race_2"
[45] "co_applicant_race_name_3" "co_applicant_race_3"
[47] "co_applicant_race_name_4" "co_applicant_race_4"
[49] "co_applicant_race_name_5" "co_applicant_race_5"
[51] "applicant_sex_name" "applicant_sex"
[53] "co_applicant_sex_name" "co_applicant_sex"
[55] "applicant_income_000s" "purchaser_type_name"
[57] "purchaser_type" "denial_reason_name_1"
[59] "denial_reason_1" "denial_reason_name_2"
[61] "denial_reason_2" "denial_reason_name_3"
[63] "denial_reason_3" "rate_spread"
[65] "hoepa_status_name" "hoepa_status"
[67] "lien_status_name" "lien_status"
[69] "edit_status_name" "edit_status"
[71] "sequence_number" "population"
[73] "minority_population" "hud_median_family_income"
[75] "tract_to_msamd_income" "number_of_owner_occupied_units"
[77] "number_of_1_to_4_family_units" "application_date_indicator"
# Emit a blank line, then preview the first 10 rows of the filtered data.
writeLines("")
head(hmda_data_pa_df, 10)
NA
# Number of rows and columns remaining after the conventional-loan filter.
dim(hmda_data_pa_df)
[1] 341995 78
# Heading for the glimpse() output that follows.
writeLines("Glimpse of hmda dataset for PA")
Glimpse of hmda dataset for PA
# Compact per-column overview: type plus the first few values of each column.
glimpse(hmda_data_pa_df)
Observations: 341,995
Variables: 78
$ as_of_year [3m[38;5;246m<int>[39m[23m 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, …
$ respondent_id [3m[38;5;246m<chr>[39m[23m "0000451965", "0000020861", "0000005599", "0000722777", "000…
$ agency_name [3m[38;5;246m<chr>[39m[23m "Consumer Financial Protection Bureau", "National Credit Uni…
$ agency_abbr [3m[38;5;246m<chr>[39m[23m "CFPB", "NCUA", "OCC", "CFPB", "CFPB", "FDIC", "HUD", "HUD",…
$ agency_code [3m[38;5;246m<int>[39m[23m 9, 5, 1, 9, 9, 3, 7, 7, 9, 7, 7, 5, 9, 5, 7, 9, 5, 7, 7, 9, …
$ loan_type_name [3m[38;5;246m<chr>[39m[23m "Conventional", "Conventional", "Conventional", "Conventiona…
$ loan_type [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ property_type_name [3m[38;5;246m<chr>[39m[23m "One-to-four family dwelling (other than manufactured housin…
$ property_type [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ loan_purpose_name [3m[38;5;246m<chr>[39m[23m "Refinancing", "Home improvement", "Home purchase", "Home pu…
$ loan_purpose [3m[38;5;246m<int>[39m[23m 3, 2, 1, 1, 2, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 3, 2, 3, 3, 3, …
$ owner_occupancy_name [3m[38;5;246m<chr>[39m[23m "Owner-occupied as a principal dwelling", "Owner-occupied as…
$ owner_occupancy [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ loan_amount_000s [3m[38;5;246m<int>[39m[23m 376, 80, 98, 157, 5, 7, 270, 55, 124, 96, 215, 100, 57, 209,…
$ preapproval_name [3m[38;5;246m<chr>[39m[23m "Not applicable", "Not applicable", "Preapproval was not req…
$ preapproval [3m[38;5;246m<int>[39m[23m 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
$ action_taken_name [3m[38;5;246m<chr>[39m[23m "Loan originated", "Loan originated", "Application denied by…
$ action_taken [3m[38;5;246m<int>[39m[23m 1, 1, 3, 1, 3, 2, 1, 6, 1, 5, 1, 1, 1, 6, 1, 1, 1, 3, 1, 1, …
$ msamd_name [3m[38;5;246m<chr>[39m[23m "Montgomery County, Bucks County, Chester County - PA", "Scr…
$ msamd [3m[38;5;246m<int>[39m[23m 33874, 42540, 38300, 25420, 37964, 38300, 38300, 38300, 3830…
$ state_name [3m[38;5;246m<chr>[39m[23m "Pennsylvania", "Pennsylvania", "Pennsylvania", "Pennsylvani…
$ state_abbr [3m[38;5;246m<chr>[39m[23m "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", …
$ state_code [3m[38;5;246m<int>[39m[23m 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, …
$ county_name [3m[38;5;246m<chr>[39m[23m "Chester County", "Wyoming County", "Allegheny County", "Cum…
$ county_code [3m[38;5;246m<int>[39m[23m 29, 131, 3, 41, 101, 3, 3, 3, 5, 77, 91, 133, 103, 11, 101, …
$ census_tract_number [3m[38;5;246m<dbl>[39m[23m 3027.03, 4004.00, 4263.00, 102.03, 119.00, 4961.02, 4141.02,…
$ applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "Not Hispanic or Latino", "Not Hispanic or Latino", "Informa…
$ applicant_ethnicity [3m[38;5;246m<int>[39m[23m 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ co_applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "Not Hispanic or Latino", "No co-applicant", "No co-applican…
$ co_applicant_ethnicity [3m[38;5;246m<int>[39m[23m 2, 5, 5, 2, 5, 2, 1, 5, 2, 2, 2, 2, 2, 2, 5, 2, 2, 5, 5, 2, …
$ applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "White", "White", "Information not provided by applicant in …
$ applicant_race_1 [3m[38;5;246m<int>[39m[23m 5, 5, 6, 5, 3, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
$ applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "White", "", "", "", "", "", "",…
$ applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, 5, NA, NA, NA, NA, NA, NA, NA, N…
$ applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "White", "No co-applicant", "No co-applicant", "White", "No …
$ co_applicant_race_1 [3m[38;5;246m<int>[39m[23m 5, 8, 8, 5, 8, 5, 5, 8, 5, 5, 5, 5, 5, 5, 8, 5, 5, 8, 8, 5, …
$ co_applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_sex_name [3m[38;5;246m<chr>[39m[23m "Male", "Male", "Information not provided by applicant in ma…
$ applicant_sex [3m[38;5;246m<int>[39m[23m 1, 1, 3, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, …
$ co_applicant_sex_name [3m[38;5;246m<chr>[39m[23m "Female", "No co-applicant", "No co-applicant", "Female", "N…
$ co_applicant_sex [3m[38;5;246m<int>[39m[23m 2, 5, 5, 2, 5, 2, 2, 5, 1, 2, 2, 2, 1, 2, 5, 2, 2, 5, 5, 2, …
$ applicant_income_000s [3m[38;5;246m<int>[39m[23m 137, 28, 36, NA, 36, 41, 77, 29, 130, 180, 119, 102, 40, 101…
$ purchaser_type_name [3m[38;5;246m<chr>[39m[23m "Fannie Mae (FNMA)", "Loan was not originated or was not sol…
$ purchaser_type [3m[38;5;246m<int>[39m[23m 1, 0, 0, 0, 0, 0, 7, 3, 0, 0, 6, 0, 1, 3, 6, 1, 6, 0, 6, 1, …
$ denial_reason_name_1 [3m[38;5;246m<chr>[39m[23m "", "", "Employment history", "", "Credit history", "", "", …
$ denial_reason_1 [3m[38;5;246m<int>[39m[23m NA, NA, 2, NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ denial_reason_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ denial_reason_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ denial_reason_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ denial_reason_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ rate_spread [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ hoepa_status_name [3m[38;5;246m<chr>[39m[23m "Not a HOEPA loan", "Not a HOEPA loan", "Not a HOEPA loan", …
$ hoepa_status [3m[38;5;246m<int>[39m[23m 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ lien_status_name [3m[38;5;246m<chr>[39m[23m "Secured by a first lien", "Secured by a first lien", "Secur…
$ lien_status [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 3, 3, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, …
$ edit_status_name [3m[38;5;246m<chr>[39m[23m "", "", "", "Quality edit failure only", "", "", "", "", "",…
$ edit_status [3m[38;5;246m<int>[39m[23m NA, NA, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ sequence_number [3m[38;5;246m<int>[39m[23m 751759, 74, 386, 11734, 19487, 247, 2160, 27048, 22166, 8854…
$ population [3m[38;5;246m<int>[39m[23m 3859, 4273, 6037, 3437, 5195, 4502, 7040, 5180, 5991, 5795, …
$ minority_population [3m[38;5;246m<dbl>[39m[23m 7.00, 2.27, 2.37, 23.54, 97.56, 4.18, 6.28, 27.72, 1.20, 9.2…
$ hud_median_family_income [3m[38;5;246m<int>[39m[23m 99600, 59000, 69700, 71900, 56200, 69700, 69700, 69700, 6970…
$ tract_to_msamd_income [3m[38;5;246m<dbl>[39m[23m 138.70, 116.80, 116.83, 104.06, 85.75, 108.56, 145.77, 87.41…
$ number_of_owner_occupied_units [3m[38;5;246m<int>[39m[23m 1208, 1296, 2371, 626, 1271, 1636, 2208, 1534, 2123, 2102, 2…
$ number_of_1_to_4_family_units [3m[38;5;246m<int>[39m[23m 1304, 1767, 2361, 824, 2174, 2010, 2489, 2140, 2580, 2312, 2…
$ application_date_indicator [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, …
Now, let's look at the missing values that are present in our data. We go through this in 4 steps: first we look for NAs, then for empty strings, then for missing values encoded as “?”, and finally for NULL values.
# Heading for the per-column NA counts.
writeLines("Checking for missing values with NA")
Checking for missing values with NA
# Number of NA entries in each column (one integer per column).
vapply(hmda_data_pa_df, function(column) sum(is.na(column)), integer(1))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 29409 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
429 966 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
341055 0 341932
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 341985 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
341985 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 341688 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
341973 0 341994
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 341994 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
28066 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 295456 0
denial_reason_2 denial_reason_name_3 denial_reason_3
332369 0 340604
rate_spread hoepa_status_name hoepa_status
336727 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
286304 0 966
minority_population hud_median_family_income tract_to_msamd_income
971 966 1021
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
993 979 0
# Heading for the per-column empty-string counts.
writeLines("Checking for missing values with empty strings")
Checking for missing values with empty strings
# Number of empty-string entries in each column.
# na.rm = TRUE is required: comparing an NA cell yields NA, and without it a
# single NA turns the whole column's count into NA instead of a number.
vapply(
  hmda_data_pa_df,
  function(column) sum(column == "", na.rm = TRUE),
  integer(1)
)
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
29409 NA 0
state_abbr state_code county_name
0 0 429
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 341055
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 341932 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
341985 NA 341985
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
341688 NA 341973
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 341994 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
341994 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
295456 NA 332369
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 340604 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 286304
edit_status sequence_number population
NA 0 NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA 0
# Heading for the per-column "?" placeholder counts.
writeLines("Checking for missing values with ?")
Checking for missing values with ?
# Number of entries in each column that are the placeholder string "?".
# na.rm = TRUE keeps NA cells from turning a column's count into NA.
vapply(
  hmda_data_pa_df,
  function(column) sum(column == "?", na.rm = TRUE),
  integer(1)
)
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 NA 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 0 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 NA 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 NA 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 0 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 NA 0
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 0 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
NA 0 NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA 0
# Heading for the per-column NULL-marker counts.
writeLines("Checking for missing values with null")
Checking for missing values with null
# Number of entries in each column that hold the literal text "NULL"
# (any letter case). A data-frame cell can never be an R NULL, and
# `x == NULL` evaluates to logical(0), so comparing against NULL always
# reported 0 regardless of the data.
vapply(
  hmda_data_pa_df,
  function(column) sum(toupper(as.character(column)) == "NULL", na.rm = TRUE),
  integer(1)
)
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 0 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
0 0 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
0 0 0
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 0 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
0 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 0 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
0 0 0
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 0 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
0 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 0 0
denial_reason_2 denial_reason_name_3 denial_reason_3
0 0 0
rate_spread hoepa_status_name hoepa_status
0 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
0 0 0
minority_population hud_median_family_income tract_to_msamd_income
0 0 0
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
0 0 0
First, we look at the race and ethnicity columns to see what information they provide and how the values are distributed for each variable.
library(janitor)
# Blank line followed by the heading for the applicant ethnicity labels.
writeLines("")
writeLines("Application ethnicity values")
Application ethnicity values
# Distinct applicant ethnicity labels present in the data.
unique(hmda_data_pa_df$applicant_ethnicity_name)
[1] "Not Hispanic or Latino"
[2] "Information not provided by applicant in mail, Internet, or telephone application"
[3] "Not applicable"
[4] "Hispanic or Latino"
# Blank line followed by the heading for the first applicant race field.
writeLines("")
writeLines("Application race name 1 values")
Application race name 1 values
# Distinct numeric codes of the first applicant race field.
unique(hmda_data_pa_df$applicant_race_1)
[1] 5 6 3 1 7 4 2
# Distinct text labels of the first applicant race field.
unique(hmda_data_pa_df$applicant_race_name_1)
[1] "White"
[2] "Information not provided by applicant in mail, Internet, or telephone application"
[3] "Black or African American"
[4] "American Indian or Alaska Native"
[5] "Not applicable"
[6] "Native Hawaiian or Other Pacific Islander"
[7] "Asian"
Now, let's filter the data frame to applicants whose ethnicity is "Hispanic or Latino" and print the number of applications for each race.
# For Hispanic or Latino applicants only: count applications per first
# reported race, replace any NA with 0, append a row-wise "Total" column
# (janitor::adorn_totals) and sort with the largest totals first.
grouped_by_race_info <- hmda_data_pa_df %>%
  filter(applicant_ethnicity_name == "Hispanic or Latino") %>%
  group_by(applicant_race_name_1) %>%
  count() %>%
  ungroup() %>%
  replace(is.na(.), 0) %>%
  adorn_totals(where = "col") %>%
  arrange(desc(Total))

head(grouped_by_race_info)
applicant_race_name_1 n Total
White 6969 6969
Information not provided by applicant in mail, Internet, or telephone application 1231 1231
Black or African American 421 421
Native Hawaiian or Other Pacific Islander 195 195
American Indian or Alaska Native 174 174
Asian 113 113
We do this because we want to merge the race and ethnicity columns into one and treat the result as a single predictor.
# Combined race/ethnicity label for the applicant: "Hispanic or Latino" when
# the ethnicity says so, otherwise the first reported race.
hmda_data_pa_df[["applicant_race_and_ethnicity"]] <- ifelse(
  test = hmda_data_pa_df[["applicant_ethnicity_name"]] == "Hispanic or Latino",
  yes = "Hispanic or Latino",
  no = hmda_data_pa_df[["applicant_race_name_1"]]
)
# Same rule applied to the co-applicant columns.
hmda_data_pa_df[["co_applicant_race_and_ethnicity"]] <- ifelse(
  test = hmda_data_pa_df[["co_applicant_ethnicity_name"]] == "Hispanic or Latino",
  yes = "Hispanic or Latino",
  no = hmda_data_pa_df[["co_applicant_race_name_1"]]
)
# Blank line followed by the heading for the combined race/ethnicity labels.
writeLines("")
writeLines("Unique values for the applicant_race_and_ethnicity column")
Unique values for the applicant_race_and_ethnicity column
# Blank line, then the distinct values of the combined race/ethnicity column.
writeLines("")
unique(hmda_data_pa_df$applicant_race_and_ethnicity)
[1] "White"
[2] "Information not provided by applicant in mail, Internet, or telephone application"
[3] "Black or African American"
[4] "American Indian or Alaska Native"
[5] "Not applicable"
[6] "Native Hawaiian or Other Pacific Islander"
[7] "Asian"
[8] "Hispanic or Latino"
# Preview the data with the two new combined race/ethnicity columns.
head(hmda_data_pa_df)
NA
Let's see how loan applications are distributed across race and ethnicity. We summarise the number of applications by the applicant's race and ethnicity.
# Applications per combined race/ethnicity group, largest group first.
mortgage_by_race_and_ethnicity <- hmda_data_pa_df %>%
  group_by(applicant_race_and_ethnicity) %>%
  tally(name = "EthnicityCount") %>%
  arrange(desc(EthnicityCount))

# NOTE(review): plotting helper defined outside this section.
graph_by_enthicity(mortgage_by_race_and_ethnicity)
Now, let's dive deeper and see which actions were taken on the applications for each race and ethnicity category. The graph shows which applicant races and ethnicities have the largest proportion of loans in the various stages, such as originated, denied, etc.
# Applications per (action taken, race/ethnicity) pair, largest first.
mortgage_status_by_race_and_ethnicity <- hmda_data_pa_df %>%
  group_by(action_taken_name, applicant_race_and_ethnicity) %>%
  summarise(ActionCount = n()) %>%
  arrange(desc(ActionCount))

# Attach each group's total application count and express every action as a
# percentage of that total. The join key is stated explicitly so the join
# cannot silently pick up additional shared columns.
mortgage_status_aggregated_by_race_and_ethnicity <- inner_join(
  mortgage_status_by_race_and_ethnicity,
  mortgage_by_race_and_ethnicity,
  by = "applicant_race_and_ethnicity"
) %>%
  mutate(percentage = (ActionCount / EthnicityCount) * 100)
Joining, by = "applicant_race_and_ethnicity"
# Plot the per-group action percentages.
# NOTE(review): plotting helpers are defined outside this section.
graph_application_race_proportion_of_loans(mortgage_status_aggregated_by_race_and_ethnicity)

# Keep originated loans only (action_taken code 1) and plot their
# applicant-income distribution.
hmda_origination_status_df <- hmda_data_pa_df[hmda_data_pa_df[["action_taken"]] == "1", ]
graph_applicant_income_histogram(
  hmda_origination_status_df,
  "Applicant income distribution for originated loans"
)
The graph above clearly shows that the denial rate is higher for minorities and, more specifically, highest for African Americans. One more thing to notice is that, in the category where the applicant's race is unknown, most of the loans were purchased by the institution.
Now let's see how income is distributed for the applicants. Let's look at the median income for each category.
# Originated loans only (action_taken code 1).
hmda_origination_status_df <- hmda_data_pa_df[hmda_data_pa_df[["action_taken"]] == "1", ]
head(hmda_origination_status_df)

# Histogram of hud_median_family_income for originated loans, using bins
# that are 1000 wide.
hmda_origination_status_df %>%
  ggplot(aes(as.numeric(hud_median_family_income))) +
  geom_histogram(binwidth = 1000, fill = "blue") +
  labs(
    x = "Median Income",
    y = "Applicant Count",
    title = "Median Income Distribution for Area for Originated Loans"
  ) +
  theme_bw()
We see that Asians have the largest median income value among all groups. At the bottom are African Americans and Hispanic or Latino applicants.
# Share of all applications per county, keeping the 20 largest counties.
# Missing counties are stored as empty strings in this data set (the NA
# count for county_name is 0), so both NA and "" are excluded here.
mortgage_distribution_by_counties <- hmda_data_pa_df %>%
  filter(!is.na(county_name), county_name != "") %>%
  group_by(county_name) %>%
  summarise(CountLoans = n()) %>%
  mutate(percentage = (CountLoans / sum(CountLoans)) * 100) %>%
  # Order the factor levels by percentage so plots can sort the bars.
  mutate(county_name = reorder(county_name, percentage)) %>%
  arrange(desc(percentage)) %>%
  head(20)

# NOTE(review): plotting helper defined outside this section.
graph_distribution_by_county(mortgage_distribution_by_counties)
# Share of originated loans per county, keeping the 20 largest counties.
# Missing counties are stored as empty strings in this data set (the NA
# count for county_name is 0), so both NA and "" are excluded here.
originated_mortgage_distribution_by_counties <- hmda_origination_status_df %>%
  filter(!is.na(county_name), county_name != "") %>%
  group_by(county_name) %>%
  summarise(CountLoans = n()) %>%
  mutate(percentage = (CountLoans / sum(CountLoans)) * 100) %>%
  # Order the factor levels by percentage so plots can sort the bars.
  mutate(county_name = reorder(county_name, percentage)) %>%
  arrange(desc(percentage)) %>%
  head(20)

# NOTE(review): plotting helper defined outside this section.
graph_distribution_by_county(originated_mortgage_distribution_by_counties)
# Counties examined individually below.
county_names <- c(
  "Allegheny County", "Philadelphia County",
  "Montgomery County", "Bucks County"
)

# For each county: count applications per first reported race (largest
# first) and print the corresponding chart.
for (selected_county in county_names) {
  in_county <- hmda_data_pa_df$county_name == selected_county
  hmda_data_county_df <- hmda_data_pa_df[in_county, ]
  mortgage_by_race_county <- hmda_data_county_df %>%
    group_by(applicant_race_name_1) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))
  print(graph_mortgage_distribution_by_race1(mortgage_by_race_county))
}
# For each county: income histograms of originated loans (action_taken
# code 1), first for White applicants and then for Black or African American
# applicants.
for (selected_county in county_names) {
  originated_in_county <- hmda_data_pa_df$action_taken == "1" &
    hmda_data_pa_df$county_name == selected_county

  hmda_origination_status_df_by_county_white <- hmda_data_pa_df[
    originated_in_county & hmda_data_pa_df$applicant_race_name_1 == "White",
  ]
  print(graph_applicant_income_histogram(
    hmda_origination_status_df_by_county_white,
    "Income distribution for Whites"
  ))

  hmda_origination_status_df_by_county_african_american <- hmda_data_pa_df[
    originated_in_county &
      hmda_data_pa_df$applicant_race_name_1 == "Black or African American",
  ]
  print(graph_applicant_income_histogram(
    hmda_origination_status_df_by_county_african_american,
    "Income distribution for African Americans"
  ))
}
# Counties examined individually below.
county_names <- c(
  "Allegheny County", "Philadelphia County",
  "Montgomery County", "Bucks County"
)

# For each county: count applications per combined race/ethnicity group
# (largest first) and print the corresponding chart.
for (selected_county in county_names) {
  in_county <- hmda_data_pa_df$county_name == selected_county
  hmda_data_county_df <- hmda_data_pa_df[in_county, ]
  mortgage_by_race_county <- hmda_data_county_df %>%
    group_by(applicant_race_and_ethnicity) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))
  print(graph_mortgage_distribution_by_race_and_ethnicity(mortgage_by_race_county))
}
# For each county: compute, per race/ethnicity group, the percentage of
# applications that ended in each action, and print the chart.
for (selected_county in county_names) {
  hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == selected_county, ]

  # Total applications per race/ethnicity group in this county.
  mortgage_by_race1_county <- hmda_data_county_df %>%
    group_by(applicant_race_and_ethnicity) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))

  # Applications per (action taken, race/ethnicity) pair in this county.
  mortgage_status_by_race1_by_county <- hmda_data_county_df %>%
    group_by(action_taken_name, applicant_race_and_ethnicity) %>%
    summarise(ActionCount = n()) %>%
    arrange(desc(ActionCount))

  # Explicit join key: avoids the "Joining, by = ..." message on every
  # iteration and guards against accidentally joining on extra shared columns.
  mortgage_status_aggregated_by_race1_by_county <- inner_join(
    mortgage_status_by_race1_by_county,
    mortgage_by_race1_county,
    by = "applicant_race_and_ethnicity"
  ) %>%
    mutate(percentage = (ActionCount / RaceCount) * 100)

  print(graph_application_race_and_ethnicity_proportion_of_loans(mortgage_status_aggregated_by_race1_by_county))
}
Joining, by = "applicant_race_and_ethnicity"
Now we start looking at the missing values and see how we can deal with them. Here, we try to visualize the missing values.
In this graph, we see the missing-value count for each column and for each combination of columns. There are a lot of missing values in some columns, such as the co-applicant columns and the applicant race 2, 3 and 4 columns.
# Visualise which columns are missing together. This must run on the raw
# data: once the imputation below has filled the gaps, there is little or
# nothing left for the upset plot to show.
gg_miss_upset(hmda_data_pa_df)

# Impute missing values with classification and regression trees.
# https://www.rdocumentation.org/packages/mice/versions/3.8.0/topics/mice.impute.cart
# m = 1 produces a single completed data set; maxit = 2 limits the run to two
# iterations; the seed makes the result reproducible.
hmda_mice_fit <- mice(hmda_data_pa_df, m = 1, maxit = 2, method = "cart", seed = 500)
hmda_data_pa_df_imputed <- mice::complete(hmda_mice_fit)
summary(hmda_data_pa_df_imputed)
Banks often use the loan-to-income ratio when they compare the applicant's income with the amount of the loan applied for. It is therefore a good variable for adding extra information about the application.
# Loan-to-income ratio: loan amount divided by applicant income.
hmda_data_pa_df$loan_to_income_ratio <- hmda_data_pa_df$loan_amount_000s /
  hmda_data_pa_df$applicant_income_000s

# Convert every character column to a factor.
# https://stackoverflow.com/questions/20637360/convert-all-data-frame-character-columns-to-factors
character_columns <- vapply(hmda_data_pa_df, is.character, logical(1))
hmda_data_pa_df[character_columns] <- lapply(hmda_data_pa_df[character_columns], as.factor)

# Numeric copy for the correlation analysis. as.numeric() turns factors into
# their level codes but, unlike as.integer(), keeps the fractional part of
# double columns such as loan_to_income_ratio. Non-finite values (e.g. a
# ratio with zero income) become NA so they cannot poison the correlations.
hmda_data_pa_df_for_correlation <- as.data.frame(lapply(
  hmda_data_pa_df,
  function(column) {
    values <- as.numeric(column)
    values[!is.finite(values)] <- NA
    values
  }
))
#head(hmda_data_pa_df_for_correlation[, c("applicant_income_000s", "loan_amount_000s")])
head(hmda_data_pa_df_for_correlation)

# NOTE(review): corr_simple is a project helper defined outside this section.
corr_simple(hmda_data_pa_df_for_correlation)
corrplot(cor(
  hmda_data_pa_df_for_correlation[, c("applicant_income_000s", "loan_amount_000s")],
  use = "na.or.complete"
))
# hmda_data_pa_df_imputed <- hmda_data_pa_df;
# Loan-to-income ratio on the imputed data: loan amount divided by income.
hmda_data_pa_df_imputed$loan_to_income_ratio <- hmda_data_pa_df_imputed$loan_amount_000s /
  hmda_data_pa_df_imputed$applicant_income_000s

# Convert every character column to a factor.
# https://stackoverflow.com/questions/20637360/convert-all-data-frame-character-columns-to-factors
imputed_character_columns <- vapply(hmda_data_pa_df_imputed, is.character, logical(1))
hmda_data_pa_df_imputed[imputed_character_columns] <- lapply(
  hmda_data_pa_df_imputed[imputed_character_columns],
  as.factor
)

# Numeric copy for the correlation analysis. as.numeric() turns factors into
# their level codes but, unlike as.integer(), keeps the fractional part of
# double columns such as loan_to_income_ratio. Non-finite values (e.g. a
# ratio with zero income) become NA so they cannot poison the correlations.
hmda_data_pa_df_imputed_for_correlation <- as.data.frame(lapply(
  hmda_data_pa_df_imputed,
  function(column) {
    values <- as.numeric(column)
    values[!is.finite(values)] <- NA
    values
  }
))
head(hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")])

# NOTE(review): corr_simple is a project helper defined outside this section.
corr_simple(hmda_data_pa_df_imputed_for_correlation)
corrplot(cor(
  hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")],
  use = "na.or.complete"
))
# Build the modelling data frame from the imputed data by running it through
# the two project helpers in sequence.
# NOTE(review): both helpers are defined outside this section; the later
# plots rely on them providing a loan_granted column - confirm.
hmda_model_df <- hmda_data_pa_df_imputed %>%
  hmda_data_frame_for_model() %>%
  process_model_df_columns()
# Application counts per race/ethnicity group, bars filled by loan_granted.
# geom_bar() is the counting geom; geom_histogram(stat = "count") gives the
# same bars but warns about ignored binning parameters.
l <- ggplot(hmda_model_df, aes(applicant_race_and_ethnicity, fill = loan_granted)) +
  geom_bar() +
  coord_flip()
print(l)

# Application counts per loan purpose, bars filled by loan_granted.
l <- ggplot(hmda_model_df, aes(loan_purpose, fill = loan_granted)) +
  geom_bar() +
  coord_flip()
print(l)
# Base-graphics plot of the loan_granted variable, coloured with entries
# 100 to 102 of R's built-in colour list.
plot(
  hmda_model_df$loan_granted,
  main = "Loan granted Variable",
  xlab = "Loan distribution",
  col = colors()[100:102]
)
# Histogram of the raw loan amounts, annotated with their skewness
# (moments::skewness) at a fixed plot position.
skew <- paste(
  "Skewness:",
  skewness(hmda_model_df$loan_amount_000s, na.rm = TRUE)
)
ggplot(data = hmda_model_df, aes(x = loan_amount_000s)) +
  geom_histogram(fill = "steelblue") +
  labs(
    title = "Loan amount distribution",
    x = "Loan amount in thousands",
    y = "Count"
  ) +
  annotate("text", x = 100000, y = 300000, size = 3.2, label = skew)
It looks like the data is highly skewed.
#install.packages("moments")
library(moments)
# Print the skewness of the raw loan amounts, ignoring NA values.
skewness(hmda_model_df$loan_amount_000s,na.rm = TRUE)
The loan amount data is highly right-skewed. It should be transformed so that the skew does not distort the prediction model.
# Histogram of the log-transformed loan amounts, annotated with the skewness
# of the transformed values at a fixed plot position.
skew <- paste(
  "Skewness:",
  skewness(log(hmda_model_df$loan_amount_000s), na.rm = TRUE)
)
ggplot(data = hmda_model_df, aes(x = log(loan_amount_000s))) +
  geom_histogram(fill = "steelblue") +
  labs(
    title = "Log transformed distribution for Loan amount",
    x = "log(Loan Amount)",
    y = "Count"
  ) +
  annotate("text", x = 8, y = 100000, size = 3.2, label = skew)

# Print the skewness of the log-transformed loan amounts.
skewness(log(hmda_model_df$loan_amount_000s), na.rm = TRUE)
# Box plot of the log-transformed loan amounts.
boxplot(
  log(hmda_model_df$loan_amount_000s),
  main = "Boxplot of Log of Loan Amounts",
  xlab = "Loan Amount",
  ylab = "Distribution of Log of Loan Amounts",
  col = colors()[100:109]
)
# Histogram of the raw applicant incomes, annotated with their skewness at a
# fixed plot position.
skew <- paste(
  "Skewness:",
  skewness(hmda_model_df$applicant_income_000s, na.rm = TRUE)
)
ggplot(data = hmda_model_df, aes(x = applicant_income_000s)) +
  geom_histogram(fill = "steelblue") +
  labs(
    title = "Applicant Income distribution",
    x = "Applicant Income in thousands",
    y = "Count"
  ) +
  annotate("text", x = 100000, y = 90000, size = 3.2, label = skew)
# Histogram of the log-transformed applicant incomes, annotated with the
# skewness of the transformed values at a fixed plot position.
skew <- paste(
  "Skewness:",
  skewness(log(hmda_model_df$applicant_income_000s), na.rm = TRUE)
)
ggplot(data = hmda_model_df, aes(x = log(applicant_income_000s))) +
  geom_histogram(fill = "steelblue") +
  labs(
    title = "Log transformed distribution for Applicant Income",
    x = "log(Applicant Income)",
    y = "Count"
  ) +
  annotate("text", x = 10, y = 90000, size = 3.2, label = skew)
# Log loan amount split by the loan_granted value.
boxplot(
  log(loan_amount_000s) ~ loan_granted,
  data = hmda_model_df,
  main = "Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Loan Amounts",
  xlab = "Loan decision",
  ylab = "Log of Loan Amounts",
  col = c("pink", "lightblue")
)

# Log applicant income split by the loan_granted value.
boxplot(
  log(applicant_income_000s) ~ loan_granted,
  data = hmda_model_df,
  main = "Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Applicant Income",
  xlab = "Loan decision",
  ylab = "Log of Applicant Income",
  col = c("pink", "lightblue")
)
# Jittered scatter: log applicant income per race/ethnicity group, coloured
# by loan_granted.
ggplot(
  hmda_model_df,
  aes(log(applicant_income_000s), applicant_race_and_ethnicity, color = loan_granted)
) +
  geom_jitter() +
  labs(title = "Log of Applicant income vs. Applicant race and ethnicity , by color = Loan decision") +
  theme_light()

# Jittered scatter: log loan amount per race/ethnicity group, coloured by
# loan_granted.
ggplot(
  hmda_model_df,
  aes(log(loan_amount_000s), applicant_race_and_ethnicity, color = loan_granted)
) +
  geom_jitter() +
  labs(title = "Log of loan amount vs. Applicant race and ethnicity , by color = Loan decision") +
  theme_light()

# Jittered scatter: loan-to-income ratio per race/ethnicity group, coloured
# by loan_granted.
ggplot(
  hmda_model_df,
  aes(loan_to_income_ratio, applicant_race_and_ethnicity, color = loan_granted)
) +
  geom_jitter() +
  labs(title = "Loan to Income ratio vs. Applicant race and ethnicity , by color = Loan decision") +
  theme_light()
# Save the imputed data set as CSV in the "2015" sub-folder of data_dir.
# data_dir already ends with "/", so the sub-path is appended without a
# leading slash to avoid a doubled separator in the file name.
imputed_csv_path <- paste0(data_dir, "2015/hmda_2015_pa_imputed.csv")
# write.csv() does not create folders; make sure the target directory exists.
dir.create(dirname(imputed_csv_path), showWarnings = FALSE, recursive = TRUE)
write.csv(hmda_data_pa_df_imputed, imputed_csv_path, row.names = FALSE)